In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [42]:
from sklearn.cluster import KMeans
In [43]:
data = pd.read_csv("data_mete.csv",delimiter=",")
In [44]:
data.shape
Out[44]:
(139123, 4)
In [45]:
data['recency']=data['recency'].apply(np.log)
In [46]:
data['frequency']=data['frequency'].apply(np.log)
In [47]:
data['total_charges']=data['total_charges'].apply(np.log)
In [48]:
data.head()
Out[48]:
Unnamed: 0 recency frequency total_charges
0 0 5.805135 2.397895 6.830518
1 1 5.710427 2.890372 7.136221
2 2 6.501290 1.386294 7.057416
3 3 6.498282 1.945910 7.325050
4 4 5.541264 2.484907 7.304281
In [49]:
data.drop(data.columns[0],axis=1,inplace=True)
In [50]:
data.columns=['recency','frequency','monetary']
In [51]:
data.head()
Out[51]:
recency frequency monetary
0 5.805135 2.397895 6.830518
1 5.710427 2.890372 7.136221
2 6.501290 1.386294 7.057416
3 6.498282 1.945910 7.325050
4 5.541264 2.484907 7.304281
In [52]:
data.isna().sum(axis=0)
Out[52]:
recency      2
frequency    0
monetary     0
dtype: int64
In [53]:
data.dropna()
Out[53]:
recency frequency monetary
0 5.805135 2.397895 6.830518
1 5.710427 2.890372 7.136221
2 6.501290 1.386294 7.057416
3 6.498282 1.945910 7.325050
4 5.541264 2.484907 7.304281
... ... ... ...
139118 1.386294 0.000000 3.452207
139119 1.386294 0.000000 3.619529
139120 1.386294 0.000000 3.157851
139121 1.386294 0.000000 3.619529
139122 1.386294 0.000000 3.619529

139121 rows × 3 columns

In [54]:
data_copy = data.copy()
In [55]:
quantiles = data.quantile(q=[0.25,0.5,0.75])
In [56]:
quantiles
Out[56]:
recency frequency monetary
0.25 4.343805 0.693147 5.225585
0.50 5.446737 0.693147 5.786959
0.75 5.948035 1.098612 6.240091
In [57]:
quantiles.to_dict()
Out[57]:
{'recency': {0.25: 4.343805421853684,
  0.5: 5.44673737166631,
  0.75: 5.948034989180646},
 'frequency': {0.25: 0.6931471805599453,
  0.5: 0.6931471805599453,
  0.75: 1.0986122886681098},
 'monetary': {0.25: 5.225585370381938,
  0.5: 5.786958729178144,
  0.75: 6.240090642789175}}
In [58]:
def R_Score(x,p,d): # x=value , p=recency,monetary,frequancy, d= quartiles dict
    if x <= d[p][0.25]:
        return 4
    elif x <= d[p][0.50]:
        return 3
    elif x <= d[p][0.75]:
        return 2
    else:
        return 1
In [59]:
def FM_Score(x,p,d): # x=value , p=recency,monetary,frequancy, d= quartiles dict
    if x <= d[p][0.25]:
        return 1
    elif x <= d[p][0.50]:
        return 2
    elif x <= d[p][0.75]:
        return 3
    else:
        return 4
In [60]:
rfm_table = data
rfm_table['R_Quartile'] = rfm_table['recency'].apply(
    R_Score,args = ('recency',quantiles))
rfm_table['F_Quartile'] = rfm_table['frequency'].apply(
    FM_Score,args=('frequency',quantiles))
rfm_table['M_Quartile'] = rfm_table['monetary'].apply(
    FM_Score,args=('monetary',quantiles))
In [61]:
rfm_table
Out[61]:
recency frequency monetary R_Quartile F_Quartile M_Quartile
0 5.805135 2.397895 6.830518 2 4 4
1 5.710427 2.890372 7.136221 2 4 4
2 6.501290 1.386294 7.057416 1 4 4
3 6.498282 1.945910 7.325050 1 4 4
4 5.541264 2.484907 7.304281 2 4 4
... ... ... ... ... ... ...
139118 1.386294 0.000000 3.452207 4 1 1
139119 1.386294 0.000000 3.619529 4 1 1
139120 1.386294 0.000000 3.157851 4 1 1
139121 1.386294 0.000000 3.619529 4 1 1
139122 1.386294 0.000000 3.619529 4 1 1

139123 rows × 6 columns

In [62]:
rfm_table['RFMScore'] = rfm_table.R_Quartile.map(str) \
                            + rfm_table.F_Quartile.map(str) \
                            + rfm_table.M_Quartile.map(str)
In [63]:
rfm_table
Out[63]:
recency frequency monetary R_Quartile F_Quartile M_Quartile RFMScore
0 5.805135 2.397895 6.830518 2 4 4 244
1 5.710427 2.890372 7.136221 2 4 4 244
2 6.501290 1.386294 7.057416 1 4 4 144
3 6.498282 1.945910 7.325050 1 4 4 144
4 5.541264 2.484907 7.304281 2 4 4 244
... ... ... ... ... ... ... ...
139118 1.386294 0.000000 3.452207 4 1 1 411
139119 1.386294 0.000000 3.619529 4 1 1 411
139120 1.386294 0.000000 3.157851 4 1 1 411
139121 1.386294 0.000000 3.619529 4 1 1 411
139122 1.386294 0.000000 3.619529 4 1 1 411

139123 rows × 7 columns

In [64]:
#Best Customers
rfm_table[rfm_table['RFMScore']=='444'].sort_values(
    'monetary',ascending=False)
Out[64]:
recency frequency monetary R_Quartile F_Quartile M_Quartile RFMScore
117987 -inf 5.971262 9.038962 4 4 4 444
7041 1.098612 3.891820 9.025439 4 4 4 444
38904 2.639057 5.017280 8.996729 4 4 4 444
117359 4.110874 5.318120 8.594147 4 4 4 444
38369 4.189655 2.302585 8.486936 4 4 4 444
... ... ... ... ... ... ... ...
48349 3.044522 2.564949 6.241484 4 4 4 444
9835 2.639057 2.079442 6.241172 4 4 4 444
54038 1.386294 2.302585 6.241075 4 4 4 444
20533 3.555348 1.791759 6.240471 4 4 4 444
86517 3.178054 2.079442 6.240412 4 4 4 444

4824 rows × 7 columns

In [65]:
rfm_table['Total Score'] = rfm_table['R_Quartile']+rfm_table['F_Quartile']+rfm_table['M_Quartile']
In [66]:
rfm_table.sort_values('Total Score',ascending=False)
Out[66]:
recency frequency monetary R_Quartile F_Quartile M_Quartile RFMScore Total Score
8421 2.890372 2.708050 7.572909 4 4 4 444 12
85079 1.945910 2.397895 6.918230 4 4 4 444 12
24317 3.806662 1.386294 6.273405 4 4 4 444 12
36525 4.043051 2.197225 7.317425 4 4 4 444 12
7587 2.639057 1.945910 6.294491 4 4 4 444 12
... ... ... ... ... ... ... ... ...
35077 6.502790 0.693147 2.484073 1 1 1 111 3
13260 5.993961 0.693147 5.202192 1 1 1 111 3
117632 6.006353 0.000000 5.025064 1 1 1 111 3
79950 6.052089 0.000000 3.273364 1 1 1 111 3
82947 5.963579 0.693147 4.823181 1 1 1 111 3

139123 rows × 8 columns

In [67]:
rfm_table[rfm_table['RFMScore']=='444']
Out[67]:
recency frequency monetary R_Quartile F_Quartile M_Quartile RFMScore Total Score
5 1.945910 2.302585 6.528659 4 4 4 444 12
10 2.890372 1.386294 6.604134 4 4 4 444 12
20 3.295837 2.397895 7.120226 4 4 4 444 12
23 3.784190 3.178054 7.716679 4 4 4 444 12
55 2.564949 2.708050 6.742822 4 4 4 444 12
... ... ... ... ... ... ... ... ...
132655 2.397895 2.708050 6.537054 4 4 4 444 12
133013 2.079442 3.332205 6.427281 4 4 4 444 12
133081 4.248495 1.945910 6.844144 4 4 4 444 12
133974 2.197225 3.332205 6.495522 4 4 4 444 12
134015 3.761200 1.791759 6.316298 4 4 4 444 12

4824 rows × 8 columns

In [68]:
def rfm_level(data):
    if data['Total Score'] >= 9:
        return 'Can\'t Loose Them'
    elif (data['Total Score']>= 8) and (data['Total Score']<9):
        return 'Champions'
    elif (data['Total Score']>= 7) and (data['Total Score']<8):
        return 'Loyals'
    elif ((data['Total Score'] >= 6) and (data['Total Score'] < 7)):
        return 'Potential'
    elif ((data['Total Score'] >= 5) and (data['Total Score'] < 6)):
        return 'Promising'
    elif ((data['Total Score'] >= 4) and (data['Total Score'] < 5)):
        return 'Needs Attention'
    else:
        return 'Require Activation'
In [69]:
rfm_table['RFM Level'] = rfm_table.apply(rfm_level,axis=1)
In [70]:
rfm_table
Out[70]:
recency frequency monetary R_Quartile F_Quartile M_Quartile RFMScore Total Score RFM Level
0 5.805135 2.397895 6.830518 2 4 4 244 10 Can't Loose Them
1 5.710427 2.890372 7.136221 2 4 4 244 10 Can't Loose Them
2 6.501290 1.386294 7.057416 1 4 4 144 9 Can't Loose Them
3 6.498282 1.945910 7.325050 1 4 4 144 9 Can't Loose Them
4 5.541264 2.484907 7.304281 2 4 4 244 10 Can't Loose Them
... ... ... ... ... ... ... ... ... ...
139118 1.386294 0.000000 3.452207 4 1 1 411 6 Potential
139119 1.386294 0.000000 3.619529 4 1 1 411 6 Potential
139120 1.386294 0.000000 3.157851 4 1 1 411 6 Potential
139121 1.386294 0.000000 3.619529 4 1 1 411 6 Potential
139122 1.386294 0.000000 3.619529 4 1 1 411 6 Potential

139123 rows × 9 columns

In [71]:
rfm_table["RFM Level"].value_counts()
Out[71]:
Potential             48364
Promising             35543
Can't Loose Them      34509
Champions              9158
Needs Attention        5710
Loyals                 5631
Require Activation      208
Name: RFM Level, dtype: int64
In [72]:
rfm_level_agg = rfm_table.groupby('RFM Level').agg({
    'recency': 'mean',
    'frequency': 'mean',
    'monetary': ['mean', 'count']}).round(1)
In [73]:
print(rfm_level_agg)
                   recency frequency monetary       
                      mean      mean     mean  count
RFM Level                                           
Can't Loose Them      -inf      -inf      6.3  34509
Champions             -inf       1.1      5.8   9158
Loyals                -inf       0.8      5.9   5631
Needs Attention        6.0       0.6      5.4   5710
Potential             -inf       0.6      5.4  48364
Promising              5.6       0.6      5.5  35543
Require Activation     6.2       0.5      4.7    208
In [74]:
import squarify
#rfm_level_agg.columns = rfm_level_agg.columns.droplevel()
rfm_level_agg.columns = ['RecencyMean','FrequencyMean','MonetaryMean', 'Count']
#Create our plot and resize it.
fig = plt.gcf()
ax = fig.add_subplot()
fig.set_size_inches(16, 9)
squarify.plot(sizes=rfm_level_agg['Count'], 
              label=['Can\'t Loose Them',
                     'Champions',
                     'Loyal',
                     'Needs Attention',
                     'Potential', 
                     'Promising', 
                     'Require Activation'], alpha=.6 )
plt.title("RFM Segments",fontsize=18,fontweight="bold")
plt.axis('off')
plt.show()
In [75]:
x3 = data[['R_Quartile','F_Quartile','M_Quartile']].iloc[:,:].values
inertia = []
inertia = []
for n in range(1 , 11):
    algorithm = (KMeans(n_clusters = n ,init='k-means++', n_init = 10 ,max_iter=300, 
                        tol=0.0001,  random_state= 111  , algorithm='full') )
    algorithm.fit(x3)
    inertia.append(algorithm.inertia_)
In [76]:
plt.figure(1 , figsize = (15 ,6))
plt.plot(np.arange(1 , 11) , inertia , 'o')
plt.plot(np.arange(1 , 11) , inertia , '-' , alpha = 0.5)
plt.xlabel('Number of Clusters') , plt.ylabel('Inertia')
plt.show()

k=3

In [77]:
new_data = data['RFM Level']

new_data = np.array(new_data)
type(new_data)
new_data
Out[77]:
array(["Can't Loose Them", "Can't Loose Them", "Can't Loose Them", ...,
       'Potential', 'Potential', 'Potential'], dtype=object)
In [78]:
algorithm = (KMeans(n_clusters = 3 ,init='k-means++', n_init = 10 ,max_iter=300, 
                        tol=0.0001,  random_state= 111  , algorithm='elkan') )
algorithm.fit(x3)
labels3 = algorithm.labels_
centroids3 = algorithm.cluster_centers_
In [79]:
import plotly.graph_objs as go
import plotly as py
In [80]:
def levels(data) :
    
    if data['RFM Level']=="Can't Loose Them" :
        return 1
    elif data['RFM Level']=="Champions":
        return 2
    elif data['RFM Level']=="Loyal" :
        return 3
    elif data['RFM Level']=="Needs Attention" :
        return 4
    elif data['RFM Level']=="Potential" :
        return 5
    elif data['RFM Level']=="Promising" :
        return 6
    else:
        return 7
data["RFM Level"] = data.apply(lambda data:levels(data),
                                      axis = 1)
In [81]:
data['label3'] =  labels3
trace1 = go.Scatter3d(
    x= data['recency'],
    y= data['frequency'],
    z= data['monetary'],
    mode='markers',
     marker=dict(
        color = data['RFM Level'], 
        size= 1,
        line=dict(
            color= data['RFM Level'],
            width= 12
        ),
        opacity=0.8
     )
)
data_temp = [trace1]
layout = go.Layout(
#     margin=dict(
#         l=0,
#         r=0,
#         b=0,
#         t=0
#     )
    title= 'Clusters',
    scene = dict(
            xaxis = dict(title  = 'R'),
            yaxis = dict(title  = 'F'),
            zaxis = dict(title  = 'M')
        )
)
fig = go.Figure(data=data_temp, layout=layout)
py.offline.iplot(fig)
In [ ]: